library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.12.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(ggplot2)
library(gridExtra)
library(ggridges)
library(adegenet)
## Loading required package: ade4
##
## /// adegenet 2.1.5 is loaded ////////////
##
## > overview: '?adegenet'
## > tutorials/doc/questions: 'adegenetWeb()'
## > bug reports/feature requests: adegenetIssues()
library(SNPfiltR)
## This is SNPfiltR v.1.0.0
##
## Detailed usage information is available at: devonderaad.github.io/SNPfiltR/
##
## If you use SNPfiltR in your published work, please cite the following papers:
##
## DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 00, 1-15. http://doi.org/10.1111/1755-0998.13618
##
## Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#read in vcf as vcfR
vcfR <- read.vcfR("~/Desktop/hipposideros/n3.vcf")
### check the metadata present in your vcf
vcfR
vcfR@fix[1:10,1:8]
vcfR@gt[1:10,1:2]
#read in sample metadata
samps<-read.csv("~/Desktop/hipposideros/hipposideros.sampling.csv")
#retain samples that were sequenced successfully
samps<-samps[samps$id %in% colnames(vcfR@gt),]
samps$id == colnames(vcfR@gt)[-1] #check that order matches
#generate popmap file. Two column popmap with the same format as stacks, and the columns must be named 'id' and 'pop'
popmap<-data.frame(id=samps$id,
pop=paste(samps$species, samps$Island, sep = "_"))
table(popmap$pop)
#hard filter to minimum depth of 3, and minimum genotype quality of 30
vcfR<-hard_filter(vcfR, depth = 3, gq = 30)
## 0.02% of genotypes fall below a read depth of 3 and were converted to NA
## 7.81% of genotypes fall below a genotype quality of 30 and were converted to NA
#execute allele balance filter
vcfR<-filter_allele_balance(vcfR)
## 11.48% of het genotypes (0.66% of all genotypes) fall outside of 0.25 - 0.75 allele balance ratio and were converted to NA
#visualize and pick appropriate max depth cutoff
max_depth(vcfR)
## cutoff is not specified, exploratory visualization will be generated.
## dashed line indicates a mean depth across all SNPs of 34.9
#filter vcf by the max depth cutoff you chose
vcfR<-max_depth(vcfR, maxdepth = 100)
## maxdepth cutoff is specified, filtered vcfR object will be returned
## 2.43% of SNPs were above a mean depth of 100 and were removed from the vcf
#check vcfR to see how many SNPs we have left
vcfR
## ***** Object of Class vcfR *****
## 33 samples
## 55362 CHROMs
## 147,323 variants
## Object size: 222.9 Mb
## 45.94 percent missing data
## ***** ***** *****
#run function to visualize samples
miss<-missing_by_sample(vcfR=vcfR)
## No popmap provided
#run function to drop samples above the threshold we want from the vcf
#setting it conservatively for now
vcfR<-missing_by_sample(vcfR=vcfR, cutoff = .9)
## 1 samples are above a 0.9 missing data cutoff, and were removed from VCF
#subset popmap to only include retained individuals
popmap<-popmap[popmap$id %in% colnames(vcfR@gt),]
#alternatively, you can drop individuals from vcfR manually using the following syntax, if a strict cutoff doesn't work for your dataset
#vcfR@gt <- vcfR@gt[,colnames(vcfR@gt) != "KVO248_H_dinops_Isabel"]
#drop invariant sites plus singletons for plotting
vcf.filt<-min_mac(vcfR, min.mac = 2)
## 55.97% of SNPs fell below a minor allele count of 2 and were removed from the VCF
#assess the effect of missing data on clustering inferences
assess_missing_data_tsne(vcf.filt, popmap = popmap, thresholds =c(.7,.8,.9, 1), clustering = FALSE)
## cutoff is specified, filtered vcfR object will be returned
## 47.19% of SNPs fell below a completeness cutoff of 0.7 and were removed from the VCF
## cutoff is specified, filtered vcfR object will be returned
## 59.93% of SNPs fell below a completeness cutoff of 0.8 and were removed from the VCF
## cutoff is specified, filtered vcfR object will be returned
## 82.68% of SNPs fell below a completeness cutoff of 0.9 and were removed from the VCF
## cutoff is specified, filtered vcfR object will be returned
## 99.46% of SNPs fell below a completeness cutoff of 1 and were removed from the VCF
## [[1]]
## V1 V2 pop missing
## 1 -16.744456 38.05600626 diadema_Gatokae 0.194534466
## 2 18.429990 30.35880386 dinops_Guadalcanal 0.743919886
## 3 48.847937 78.03315228 diadema_Guadalcanal 0.008233336
## 4 57.445403 79.91380439 diadema_Guadalcanal 0.018072465
## 5 71.605100 -83.62683122 dinops_Guadalcanal 0.017605325
## 6 79.694850 -79.16750042 dinops_Guadalcanal 0.158243555
## 7 73.788514 -72.39200762 dinops_Guadalcanal 0.053399901
## 8 58.840082 71.21247151 diadema_Guadalcanal 0.009167616
## 9 50.093631 69.19411417 diadema_Guadalcanal 0.058947184
## 10 -6.287859 22.56222630 diadema_Isabel 0.395229336
## 11 -16.119427 21.57419760 diadema_Isabel 0.137660214
## 12 -23.032231 26.49747140 diadema_Isabel 0.043589968
## 13 -8.776985 30.13336524 diadema_Isabel 0.383434060
## 14 4.358041 26.74448750 diadema_Isabel 0.760999679
## 15 70.310253 -8.64102199 dinops_Rendova 0.016437476
## 16 70.682287 0.26588171 dinops_Rendova 0.010773408
## 17 22.874371 18.13162684 dinops_Rendova 0.676885346
## 18 79.592881 0.02234689 dinops_Rendova 0.006977898
## 19 79.233443 -8.88428540 dinops_Rendova 0.007153076
## 20 -19.066359 -14.71036261 diadema_Rendova 0.017021401
## 21 -17.760806 -21.28675901 demissus_Makira 0.054684535
## 22 25.227486 34.32396583 dinops_Guadalcanal 0.618492891
## 23 66.118207 -76.74591245 dinops_Guadalcanal 0.032728971
## 24 -120.786111 -41.52728831 diadema_Ngella Islands 0.008817261
## 25 -138.359281 -36.44430848 diadema_Ngella Islands 0.025196345
## 26 -132.019279 -27.04641869 diadema_Ngella Islands 0.010860997
## 27 -121.148636 -30.21588551 diadema_Ngella Islands 0.014539721
## 28 -131.453467 -45.40259541 diadema_Ngella Islands 0.009576363
## 29 -128.739874 -36.12114582 diadema_Ngella Islands 0.014422937
## 30 -6.882223 -17.96834825 diadema_New Britain 0.033750839
## 31 4.239172 38.66812842 diadema_PNG 0.117252051
## 32 5.795346 14.48862103 diadema_PNG 0.090917054
##
## [[2]]
## V1 V2 pop missing
## 1 52.923162 -25.77476 diadema_Gatokae 0.120503251
## 2 10.094502 -60.07449 dinops_Guadalcanal 0.686583817
## 3 -58.042845 -84.63555 diadema_Guadalcanal 0.004193759
## 4 -74.006172 -82.73297 diadema_Guadalcanal 0.006579200
## 5 -38.721031 190.74533 dinops_Guadalcanal 0.015120619
## 6 -50.215902 202.40456 dinops_Guadalcanal 0.083605864
## 7 -26.457070 201.39452 dinops_Guadalcanal 0.017236736
## 8 -75.802445 -98.77197 diadema_Guadalcanal 0.004847832
## 9 -59.529833 -100.62589 diadema_Guadalcanal 0.024470009
## 10 50.818225 -46.60589 diadema_Isabel 0.292062637
## 11 70.020190 -56.05669 diadema_Isabel 0.068485245
## 12 71.433186 -40.44202 diadema_Isabel 0.016582663
## 13 54.069387 -61.54989 diadema_Isabel 0.283867493
## 14 34.366005 -59.85504 diadema_Isabel 0.710322804
## 15 148.853156 -92.91231 dinops_Rendova 0.007194798
## 16 161.684605 -102.88627 dinops_Rendova 0.005001731
## 17 21.460383 -81.65200 dinops_Rendova 0.646800816
## 18 171.621896 -89.98867 dinops_Rendova 0.003539687
## 19 158.775434 -80.01458 dinops_Rendova 0.003308838
## 20 29.739722 16.51303 diadema_Rendova 0.010118887
## 21 26.359527 28.31122 demissus_Makira 0.023277288
## 22 -2.432004 -54.97074 dinops_Guadalcanal 0.541264284
## 23 -37.691403 213.10623 dinops_Guadalcanal 0.012850602
## 24 -147.817391 45.60362 diadema_Ngella Islands 0.003962910
## 25 -152.405896 62.33438 diadema_Ngella Islands 0.009003116
## 26 -132.583462 55.06326 diadema_Ngella Islands 0.005117156
## 27 -154.888937 29.17433 diadema_Ngella Islands 0.005309530
## 28 -134.937920 33.97661 diadema_Ngella Islands 0.006617675
## 29 -165.304868 46.54382 diadema_Ngella Islands 0.005232581
## 30 45.702013 35.01273 diadema_New Britain 0.018814205
## 31 105.330469 25.03939 diadema_PNG 0.062136894
## 32 97.585316 34.32671 diadema_PNG 0.045669655
##
## [[3]]
## V1 V2 pop missing
## 1 -84.78334522 4.612278 diadema_Gatokae 0.0326686844
## 2 -0.04739858 40.700966 dinops_Guadalcanal 0.4955492256
## 3 49.43361929 35.994807 diadema_Guadalcanal 0.0004450774
## 4 31.77680174 36.150176 diadema_Guadalcanal 0.0008011394
## 5 23.71630897 127.674983 dinops_Guadalcanal 0.0102367812
## 6 24.78776058 146.619844 dinops_Guadalcanal 0.0141534627
## 7 14.98025419 137.851107 dinops_Guadalcanal 0.0037386505
## 8 40.81415890 46.631798 diadema_Guadalcanal 0.0027594801
## 9 40.35902764 25.935140 diadema_Guadalcanal 0.0056079758
## 10 -55.12534500 -7.485090 diadema_Isabel 0.1104682215
## 11 -60.09763668 -20.060000 diadema_Isabel 0.0144205092
## 12 -72.99821752 -17.015104 diadema_Isabel 0.0032935731
## 13 -67.47681654 -4.101519 diadema_Isabel 0.1219512195
## 14 -55.04415941 10.436555 diadema_Isabel 0.5354281645
## 15 -74.42552695 -102.535283 dinops_Rendova 0.0009791704
## 16 -68.06060045 -114.266561 dinops_Rendova 0.0010681859
## 17 -24.34461207 30.244245 dinops_Rendova 0.5198504540
## 18 -86.14843617 -109.039505 dinops_Rendova 0.0011572014
## 19 -79.73562302 -120.773303 dinops_Rendova 0.0007121239
## 20 -12.40629570 -23.569686 diadema_Rendova 0.0048068364
## 21 -4.21590287 -17.760865 demissus_Makira 0.0089905644
## 22 8.40246607 49.030204 dinops_Guadalcanal 0.3154708919
## 23 34.01477791 137.071454 dinops_Guadalcanal 0.0016022788
## 24 83.36004549 -47.430174 diadema_Ngella Islands 0.0008011394
## 25 78.49711589 -13.882633 diadema_Ngella Islands 0.0022253872
## 26 95.18219781 -39.376159 diadema_Ngella Islands 0.0011572014
## 27 91.27376272 -10.974047 diadema_Ngella Islands 0.0016912943
## 28 78.70366989 -34.142018 diadema_Ngella Islands 0.0030265266
## 29 90.12527268 -24.214715 diadema_Ngella Islands 0.0010681859
## 30 -55.91882762 26.336683 diadema_New Britain 0.0097026883
## 31 3.92000199 -71.169603 diadema_PNG 0.0210076553
## 32 11.48150201 -77.493976 diadema_PNG 0.0155777105
##
## [[4]]
## V1 V2 pop missing
## 1 42.868879 -2.552009 diadema_Gatokae 0
## 2 50.004070 -47.005091 dinops_Guadalcanal 0
## 3 -56.447556 63.749656 diadema_Guadalcanal 0
## 4 -71.238425 65.742178 diadema_Guadalcanal 0
## 5 -17.081721 -114.777664 dinops_Guadalcanal 0
## 6 -32.709819 -126.158157 dinops_Guadalcanal 0
## 7 13.741045 -97.804526 dinops_Guadalcanal 0
## 8 21.425413 67.600290 diadema_Guadalcanal 0
## 9 -70.049316 50.288752 diadema_Guadalcanal 0
## 10 32.705080 -50.596141 diadema_Isabel 0
## 11 48.898130 -75.104117 diadema_Isabel 0
## 12 280.718725 104.365502 diadema_Isabel 0
## 13 29.795939 -90.804467 diadema_Isabel 0
## 14 -29.693643 -111.982788 diadema_Isabel 0
## 15 59.358252 -61.364167 dinops_Rendova 0
## 16 44.164650 -60.958850 dinops_Rendova 0
## 17 -64.799692 135.149860 dinops_Rendova 0
## 18 44.475828 -20.933483 dinops_Rendova 0
## 19 -7.091263 28.014397 dinops_Rendova 0
## 20 44.167834 13.620316 diadema_Rendova 0
## 21 -48.830222 -114.846161 demissus_Makira 0
## 22 31.024222 -67.460580 dinops_Guadalcanal 0
## 23 -45.464135 -132.189293 dinops_Guadalcanal 0
## 24 -7.406134 71.752369 diadema_Ngella Islands 0
## 25 6.191084 36.949818 diadema_Ngella Islands 0
## 26 -20.380579 35.598758 diadema_Ngella Islands 0
## 27 4.859013 59.698323 diadema_Ngella Islands 0
## 28 -6.291670 44.305438 diadema_Ngella Islands 0
## 29 -19.460552 50.683255 diadema_Ngella Islands 0
## 30 -97.673627 107.587413 diadema_New Britain 0
## 31 -75.773513 123.321177 diadema_PNG 0
## 32 -84.006299 116.109990 diadema_PNG 0
#do we need to drop samples more aggressively?
#which filtering threshold adequately removes missing data to allow objective clustering?
#Samples at the high end of missing data cutoffs are not clustering correctly and dragging this whole enterprise down
#let's set a better cutoff
vcfR<-missing_by_sample(vcfR=vcfR, cutoff = .75)
## 4 samples are above a 0.75 missing data cutoff, and were removed from VCF
#visualize missing data by SNP and the effect of various cutoffs on the missingness of each sample
missing_by_snp(vcfR)
## cutoff is not specified, exploratory visualizations will be generated
## Picking joint bandwidth of 0.0675
## filt missingness snps.retained
## 1 0.30 0.18533212 105695
## 2 0.50 0.13838845 95162
## 3 0.60 0.11434738 88605
## 4 0.65 0.09821311 83421
## 5 0.70 0.08990074 80426
## 6 0.75 0.08110860 76970
## 7 0.80 0.06297955 68647
## 8 0.85 0.05344274 63415
## 9 0.90 0.03132406 48045
## 10 0.95 0.01745598 35700
## 11 1.00 0.00000000 18251
#choose a value that retains an acceptable amount of missing data in each sample, and maximizes SNPs retained while minimizing overall missing data, and filter vcf
vcfR<-missing_by_snp(vcfR, cutoff = .9)
## cutoff is specified, filtered vcfR object will be returned
## 67.39% of SNPs fell below a completeness cutoff of 0.9 and were removed from the VCF
check once more to see how many SNPs and individuals remain compared to our original, unfiltered vcf
#use this function to remove loci that have become invariant during our filtering procedure plus singletons which don't contribute to phylogenetic signal
vcfR<-min_mac(vcfR, min.mac = 2)
## 64.14% of SNPs fell below a minor allele count of 2 and were removed from the VCF
#check the number of loci retained
vcfR
## ***** Object of Class vcfR *****
## 28 samples
## 10598 CHROMs
## 17,228 variants
## Object size: 43.3 Mb
## 3.27 percent missing data
## ***** ***** *****
#plot depth per snp and per sample
dp <- extract.gt(vcfR, element = "DP", as.numeric=TRUE)
heatmap.bp(dp, rlabels = FALSE)
#plot genotype quality per snp and per sample
gq <- extract.gt(vcfR, element = "GQ", as.numeric=TRUE)
heatmap.bp(gq, rlabels = FALSE)
#write out the filtered vcf file
#write.vcf(vcfR, file = "~/Desktop/hipposideros/filtered.vcf.gz")
#filter to one SNP per locus to get an unlinked dataset
unlinked.vcf<-distance_thin(vcfR, min.distance = 10000)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======= | 11%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========= | 14%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|============== | 21%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================ | 24%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 32%
|
|======================= | 33%
|
|======================= | 34%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 36%
|
|========================== | 37%
|
|========================== | 38%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 39%
|
|============================ | 40%
|
|============================ | 41%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|============================== | 44%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 49%
|
|=================================== | 50%
|
|=================================== | 51%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 53%
|
|===================================== | 54%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 59%
|
|========================================== | 60%
|
|========================================== | 61%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 62%
|
|============================================ | 63%
|
|============================================ | 64%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 69%
|
|================================================= | 70%
|
|================================================= | 71%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|=================================================== | 74%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 79%
|
|======================================================== | 80%
|
|======================================================== | 81%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 82%
|
|========================================================== | 83%
|
|========================================================== | 84%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 86%
|
|============================================================= | 87%
|
|============================================================= | 88%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 89%
|
|=============================================================== | 90%
|
|=============================================================== | 91%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 92%
|
|================================================================= | 93%
|
|================================================================= | 94%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 99%
|
|======================================================================| 100%
## 10598 out of 17228 input SNPs were not located within 10000 base-pairs of another SNP and were retained despite filtering
#write out vcf file of unlinked SNPs
#write.vcf(unlinked.vcf, file = "~/Desktop/hipposideros/unlinked.filtered.vcf.gz")